suppressMessages(library(data.table))
suppressMessages(library(tidyverse))
suppressMessages(library(splines))
rain <- fread("airport_data/IDCJAC0009_009021_1800_Data.csv")
rain <- rain %>% drop_na("Rainfall amount (millimetres)")
min_year = rain$Year %>% min
max_year = rain$Year %>% max
rainfall_totals <- rain %>%
group_by(Year, Month) %>%
summarise(total_rainfall = sum(`Rainfall amount (millimetres)`))
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
rainfall_totals %>%
filter(Year != 1944, Year != 2022) %>%
ggplot(aes(Year, total_rainfall)) +
facet_grid(cols=vars(Month)) +
geom_point(alpha = 0.5) +
geom_smooth(method="lm", colour = "red") +
theme_classic() +
theme(axis.text.x=element_text(angle=60,hjust=1)) +
scale_x_continuous(breaks = c(min_year+1, max_year-1)) +
ggtitle("Perth rainfall monthly totals for each month between 1945 and 2021")
## `geom_smooth()` using formula = 'y ~ x'
#ggsave("monthly_totals_regressed_year_as_predictor.png")
rainfall_totals %>%
filter(Year != 1944, Year != 2022) %>%
filter(Month > 4, Month <= 9) %>%
ggplot(aes(Year, total_rainfall)) +
facet_grid(cols=vars(Month)) +
geom_point(alpha = 0.5) +
geom_smooth(method="loess", colour = "red") +
theme_classic() +
theme(axis.text.x=element_text(angle=60,hjust=1)) +
scale_x_continuous(breaks = c(min_year+1, max_year-1)) +
ggtitle("Perth rainfall monthly totals for each month between 1945 and 2021")
## `geom_smooth()` using formula = 'y ~ x'
rainfall_totals %>%
filter(Year != 1944, Year != 2022) %>%
filter(Month < 4 | Month > 10) %>%
ggplot(aes(Year, total_rainfall)) +
facet_grid(cols=vars(Month)) +
geom_point(alpha = 0.5) +
geom_smooth(method="loess", colour = "red") +
theme_classic() +
theme(axis.text.x=element_text(angle=60,hjust=1)) +
scale_x_continuous(breaks = c(min_year+1, max_year-1)) +
ggtitle("Perth rainfall monthly totals for each month between 1945 and 2021")
## `geom_smooth()` using formula = 'y ~ x'
June vs July - years when july is wetter?
rainfall_totals_june_july_higher <- rainfall_totals %>%
filter(Month > 5, Month < 8) %>%
#cols become Year june july
pivot_wider(names_from = c(Month), values_from = total_rainfall) %>%
mutate(higher_rain = case_when(
`6` > `7` ~ "June",
`7` > `6` ~ "July",
)) %>%
mutate(higher_rain_ordinal = case_when(
higher_rain == "June" ~ 0,
higher_rain == "July" ~ 1
)) %>%
mutate(delta = `7` - `6`)
rainfall_totals_june_july_higher %>%
ggplot(aes(Year, higher_rain_ordinal)) +
geom_point()
## Warning: Removed 1 rows containing missing values (`geom_point()`).
rainfall_totals_june_july_higher %>%
ggplot(aes(Year, delta)) +
geom_point() +
geom_hline(yintercept = 0, linetype = 'dashed', alpha = 0.8, colour = "red") +
geom_smooth(method="glm") +
ggtitle("Difference in total rainfall in July compared to June (Perth Airport, BOM data)") +
ylab("Difference (mm)") +
theme_bw() +
ylim(-400,400) +
annotate("text", label = "July", x = 1980, y = 300, size = 8) +
annotate("text", label = "June", x = 1980, y = -300, size = 8)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Removed 1 rows containing missing values (`geom_point()`).
#ggsave("Difference in total rainfall in July compared to June.png")
m.rainfall_totals_june_july_higher <- glm(delta ~ Year, data = rainfall_totals_june_july_higher)
summary(m.rainfall_totals_june_july_higher)
##
## Call:
## glm(formula = delta ~ Year, data = rainfall_totals_june_july_higher)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -272.34 -46.08 1.90 44.35 403.26
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1859.4794 923.1161 -2.014 0.0475 *
## Year 0.9385 0.4656 2.016 0.0474 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 8571.591)
##
## Null deviance: 686268 on 77 degrees of freedom
## Residual deviance: 651441 on 76 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 931.71
##
## Number of Fisher Scoring iterations: 2
rainfall_totals_july_august_higher <- rainfall_totals %>%
filter(Month > 6, Month < 9) %>%
#cols become Year june july
pivot_wider(names_from = c(Month), values_from = total_rainfall) %>%
mutate(higher_rain = case_when(
`7` > `8` ~ "July",
`8` > `7` ~ "August"
)) %>%
mutate(higher_rain_ordinal = case_when(
higher_rain == "July" ~ 0,
higher_rain == "August" ~ 1
)) %>%
mutate(delta = `8` - `7`)
rainfall_totals_july_august_higher %>%
ggplot(aes(Year, delta)) +
geom_point() +
geom_hline(yintercept = 0, linetype = 'dashed', alpha = 0.8, colour = "red") +
geom_smooth(method="glm") +
ggtitle("Difference in total rainfall in August compared to July (Perth Airport, BOM data)") +
ylab("Difference (mm)") +
theme_bw() +
ylim(-400,400) +
annotate("text", label = "August", x = 1980, y = 300, size = 8) +
annotate("text", label = "July", x = 1980, y = -300, size = 8)
## `geom_smooth()` using formula = 'y ~ x'
#ggsave("Difference in total rainfall in August compared to July")
m.rainfall_totals_july_august_higher <- glm(delta ~ Year, data = rainfall_totals_july_august_higher)
summary(m.rainfall_totals_july_august_higher)
##
## Call:
## glm(formula = delta ~ Year, data = rainfall_totals_july_august_higher)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -351.30 -46.22 -3.07 52.36 225.07
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -750.8180 862.5827 -0.870 0.387
## Year 0.3597 0.4351 0.827 0.411
##
## (Dispersion parameter for gaussian family taken to be 7484.283)
##
## Null deviance: 573920 on 77 degrees of freedom
## Residual deviance: 568806 on 76 degrees of freedom
## AIC: 921.13
##
## Number of Fisher Scoring iterations: 2
To get stats for these 12 regressions means
Do some basic sensitivity tests:
test_july = rainfall_totals %>% filter(Month == 7)
july_rainfall_vs_year <- lm(total_rainfall ~ Year, data = test_july)
july_rainfall_vs_year %>% summary()
##
## Call:
## lm(formula = total_rainfall ~ Year, data = test_july)
##
## Residuals:
## Min 1Q Median 3Q Max
## -109.487 -34.285 -1.097 32.599 278.202
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1198.6345 577.9313 2.074 0.0415 *
## Year -0.5261 0.2915 -1.805 0.0751 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 57.96 on 76 degrees of freedom
## Multiple R-squared: 0.0411, Adjusted R-squared: 0.02848
## F-statistic: 3.258 on 1 and 76 DF, p-value: 0.07506
plot(july_rainfall_vs_year)
test_dec = rainfall_totals %>% filter(Month == 12)
dec_rainfall_vs_year <- lm(total_rainfall ~ Year, data = test_dec)
dec_rainfall_vs_year %>% summary()
##
## Call:
## lm(formula = total_rainfall ~ Year, data = test_dec)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.522 -7.378 -3.270 2.290 59.489
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 222.34162 131.43143 1.692 0.0948 .
## Year -0.10663 0.06629 -1.608 0.1119
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.18 on 76 degrees of freedom
## Multiple R-squared: 0.03292, Adjusted R-squared: 0.0202
## F-statistic: 2.587 on 1 and 76 DF, p-value: 0.1119
plot(dec_rainfall_vs_year)
predict(dec_rainfall_vs_year, list(Year = 2021), interval = "confidence")
## fit lwr upr
## 1 6.844791 0.9561971 12.73338
summary(dec_rainfall_vs_year)$coefficients[2,]
## Estimate Std. Error t value Pr(>|t|)
## -0.10662881 0.06629153 -1.60848322 0.11187595
I can make this set the intercept to the avg in 1945 by substracting 1945 from each year before doing the regression.
models.year_vs_rainfall_per_month <- rainfall_totals %>%
mutate(Year = Year - 1945) %>%
group_by(Month) %>%
group_modify(~ broom::tidy(lm(total_rainfall ~ Year, data = .)))
models.year_vs_rainfall_per_month
## # A tibble: 24 × 6
## # Groups: Month [12]
## Month term estimate std.error statistic p.value
## <int> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 (Intercept) 2.37 4.37 0.542 5.90e- 1
## 2 1 Year 0.216 0.0979 2.20 3.07e- 2
## 3 2 (Intercept) 12.6 5.94 2.12 3.72e- 2
## 4 2 Year 0.0616 0.133 0.463 6.45e- 1
## 5 3 (Intercept) 13.3 3.83 3.48 8.41e- 4
## 6 3 Year 0.0758 0.0859 0.882 3.80e- 1
## 7 4 (Intercept) 48.8 6.56 7.44 1.32e-10
## 8 4 Year -0.232 0.147 -1.57 1.20e- 1
## 9 5 (Intercept) 121. 10.8 11.2 8.15e-18
## 10 5 Year -0.618 0.244 -2.53 1.34e- 2
## # … with 14 more rows
models.year_vs_rainfall_per_month %>%
filter(term == "Year") %>%
ggplot(aes(Month, estimate, ymin=estimate-std.error, ymax=estimate+std.error, colour = -log10(p.value))) +
theme_classic() +
geom_point() +
geom_pointrange() +
ggtitle("Estimated change in average total monthly rainfall per year") +
ylab("estimated change rainfall (mm)") +
scale_x_continuous(breaks = 1:12)
#ggsave("Estimated_change_in_average_total_monthly_rainfall_per_year.png")
models.year_vs_rainfall_per_month %>%
filter(term == "(Intercept)") %>%
ggplot(aes(Month, estimate, ymin=estimate-std.error, ymax=estimate+std.error, colour = -log10(p.value))) +
geom_point() +
geom_pointrange() +
ggtitle("Estimate for mean rainfall at Perth Airport in 1945 (intercept estimates)") +
theme_classic()
2021 estimates? - 2021 will be 1945_total + (2021-1945)*beta_month
models.year_vs_rainfall_per_month %>%
pivot_wider(names_from = c(term), values_from = c("estimate","std.error","statistic","p.value")) %>%
mutate(estimate_2021 = `estimate_(Intercept)` + 76*`estimate_Year`) %>%
ggplot(aes(Month, estimate_2021, colour = -log10(p.value_Year))) +
geom_point() +
ggtitle("Estimate for mean rainfall at Perth Airport in 2021") +
theme_classic()
Maybe easier to use predict() on model object to get
2021 estimates and std.err. I have re-done the regression without
subtracting year as it is simpler to put in
predict(year = 2021) than as
predict(year = 2021-1945).
Years <- c(1945,1970,2021)
models.year_vs_rainfall_per_month.predictions.CI <- rainfall_totals %>%
group_by(Month) %>%
group_modify(
~ as.data.frame(predict(
lm(total_rainfall ~ Year, data = .), list(Year = Years), interval = "confidence"
))) %>%
mutate(Year = Years)
models.year_vs_rainfall_per_month.predictions.CI %>%
ggplot(aes(Month, fit, colour = Year, group = Year)) +
geom_point() + geom_line() +
theme_classic() +
ggtitle("Monthly average total rainfall estimate predicted from per month model\nfor 1945, 1970, and 2021 (Perth Airport)") +
ylab("Estimated rainfall total (mm)") +
scale_x_continuous(breaks = 1:12)
#ggsave("monthly_total_rain_est_predicted_from_per_month_models.png")
Oh cool year is significant here! All/most months are significant too. Note January is baseline here. Hence all estimates are positive. Significance of other months depends on which month is used as the default month here. Since summer months are similar to each other it is unsurprising that they have small relative differences to each other.
#need Month as factor for this to work
rainfall_totals_cat_month <- rainfall_totals %>%
mutate(Month = as.factor(Month)) %>%
mutate(Month = fct_relevel(Month, as.character(1:12)))
month_predict_rainfall <- lm(total_rainfall ~ Month + Year, data = rainfall_totals_cat_month)
summary(month_predict_rainfall)
##
## Call:
## lm(formula = total_rainfall ~ Month + Year, data = rainfall_totals_cat_month)
##
## Residuals:
## Min 1Q Median 3Q Max
## -141.718 -17.390 -4.492 16.877 284.929
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 509.67014 112.51788 4.530 6.68e-06 ***
## Month2 4.30256 6.27236 0.686 0.4929
## Month3 5.57436 6.27236 0.889 0.3744
## Month4 29.21923 6.27236 4.658 3.65e-06 ***
## Month5 86.68607 6.25255 13.864 < 2e-16 ***
## Month6 141.74050 6.25255 22.669 < 2e-16 ***
## Month7 144.69329 6.27262 23.067 < 2e-16 ***
## Month8 106.90996 6.27262 17.044 < 2e-16 ***
## Month9 61.29329 6.27262 9.772 < 2e-16 ***
## Month10 32.39201 6.27262 5.164 2.96e-07 ***
## Month11 15.08047 6.27262 2.404 0.0164 *
## Month12 0.03560 6.27262 0.006 0.9955
## Year -0.25158 0.05668 -4.438 1.02e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.17 on 925 degrees of freedom
## Multiple R-squared: 0.6488, Adjusted R-squared: 0.6442
## F-statistic: 142.4 on 12 and 925 DF, p-value: < 2.2e-16
LOL this looks awful! 🤣 I guess Month as a categorical variable is problematic here.
plot(month_predict_rainfall)
Try month as a numeric value? Well this returns a similar value for Year at least. Diagnostic plots still show assumptions very violated.
month_predict_rainfall_monthnumeric <- lm(total_rainfall ~ Month + Year, data = rainfall_totals)
summary(month_predict_rainfall_monthnumeric)
##
## Call:
## lm(formula = total_rainfall ~ Month + Year, data = rainfall_totals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -81.97 -49.80 -23.40 36.58 375.96
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 575.9180 187.1259 3.078 0.00215 **
## Month 1.7696 0.6171 2.868 0.00423 **
## Year -0.2643 0.0943 -2.803 0.00517 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 65.17 on 935 degrees of freedom
## Multiple R-squared: 0.01721, Adjusted R-squared: 0.01511
## F-statistic: 8.188 on 2 and 935 DF, p-value: 0.0002983
plot(month_predict_rainfall_monthnumeric)
rain <- rain %>%
mutate(did_rain = `Rainfall amount (millimetres)` > 0)
rainy_days <- rain %>%
group_by(Month, Day) %>%
summarise(number_of_rainy_days = sum(did_rain), days_observed = n()) %>%
mutate(proportion_rained = number_of_rainy_days/days_observed)
## `summarise()` has grouped output by 'Month'. You can override using the
## `.groups` argument.
rainy_days
## # A tibble: 366 × 5
## # Groups: Month [12]
## Month Day number_of_rainy_days days_observed proportion_rained
## <int> <int> <int> <int> <dbl>
## 1 1 1 5 78 0.0641
## 2 1 2 4 78 0.0513
## 3 1 3 6 78 0.0769
## 4 1 4 3 78 0.0385
## 5 1 5 8 78 0.103
## 6 1 6 4 78 0.0513
## 7 1 7 6 78 0.0769
## 8 1 8 8 78 0.103
## 9 1 9 5 78 0.0641
## 10 1 10 7 78 0.0897
## # … with 356 more rows
rainy_days %>%
ggplot(aes(x=Day, y=proportion_rained)) +
geom_point() + facet_grid(cols=vars(Month), space="free") +
theme_classic() + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
How can I get this to go across months continously?
rainy_days %>%
ggplot(aes(x=Month, y=proportion_rained, group = factor(Month))) +
geom_boxplot()
rain %>%
filter(Month == 6) %>%
ggplot(aes(Day, `Rainfall amount (millimetres)`)) +
geom_point()
rainy_days %>%
filter(Month == 6) %>%
ggplot(aes(Day, proportion_rained)) +
geom_point()
rainy_days %>%
filter(Month == 7) %>%
ggplot(aes(Day, proportion_rained)) +
geom_point()
rainy_days %>%
filter(Month == 8) %>%
ggplot(aes(Day, proportion_rained)) +
geom_point()
Combine month and day as a single axis? Yes, can do this as a simple index.
Can then add days and month as x axis?
rainy_days
## # A tibble: 366 × 5
## # Groups: Month [12]
## Month Day number_of_rainy_days days_observed proportion_rained
## <int> <int> <int> <int> <dbl>
## 1 1 1 5 78 0.0641
## 2 1 2 4 78 0.0513
## 3 1 3 6 78 0.0769
## 4 1 4 3 78 0.0385
## 5 1 5 8 78 0.103
## 6 1 6 4 78 0.0513
## 7 1 7 6 78 0.0769
## 8 1 8 8 78 0.103
## 9 1 9 5 78 0.0641
## 10 1 10 7 78 0.0897
## # … with 356 more rows
rainy_days$index <- 1:length(rainy_days$Day)
rainy_days <- rainy_days %>% mutate(date_str = str_c(Day, Month, sep = "/"))
#custom_breaks = seq(1,366,31)
custom_breaks = c(
1,32,32+29,32+29+31,92+30,153,153+30,214,245,275,306,336,366
)
rainy_days %>%
ggplot(aes(index, proportion_rained*100)) +
geom_point() +
geom_smooth(span=0.5) +
scale_y_continuous(breaks = seq(0,1,0.1)*100) +
ylab("Percentage of days with any rain") +
xlab("Date") +
scale_x_continuous(breaks = custom_breaks, labels = rainy_days$date_str[custom_breaks]) +
ggtitle("Should you plan to go outside? Percent chance of any rain for Perth Airport") +
theme_bw()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
#ggsave("Percent chance of any rain for Perth Airport.png", width = 10, height = 7)
m.loess <- loess(proportion_rained ~ index, data = rainy_days, span=0.5)
X <- rainy_days$index
Y <- predict(m.loess, X)
plot(X, rainy_days$proportion_rained, xlab = "day", ylab = "proportion (0 to 1)", main="proportion of rainy days (Perth Airport)")
points(X,Y, type = "l")
dY <- diff(Y)/diff(X)
dX <- rowMeans(embed(X,2))
plot(dX, abs(dY), type="l", xlab = "day", ylab = "absolute rate of change", main = "rate of change in proportion of rainy days (Perth Airport)")